import argparse
import os
import json
import ast
from prompt.guiActPrompt import GUIACTLOWACTIONPREDICTPROMPT_FOROSATLAS, GUIACT_FORUITARS
from tqdm import tqdm
import sys
import re
sys.path.append("./")
from utils.logging_utils import setup_logger_to_stdout
from utils.schema.GUI_OWL.common import pil_to_base64, message_translate
from preprocess_base import BasePreProcess
logger = setup_logger_to_stdout()


def parse_args(args=None, namespace=None):
    """Parse the command-line options of the GUIAct preprocessing script.

    Args:
        args: Optional list of argument strings.  ``None`` (the default)
            makes argparse read ``sys.argv[1:]``.
        namespace: Optional object to populate with the parsed attributes.
            ``None`` (the default) creates a fresh ``argparse.Namespace``.

    Returns:
        The namespace holding ``dataset_name``, ``dataset_type``,
        ``dataset_path``, ``model_name`` and ``save_path``.
    """
    parser = argparse.ArgumentParser(description='Origin Dataset To Json')
    parser.add_argument('--dataset_name', type=str, default="GUIAct",
                        help='dataset name')
    parser.add_argument('--dataset_type', type=str, default='mobile', help='dataset type')
    parser.add_argument('--dataset_path', type=str, default="/data3/cpz/datasets/GUIAct/",
                        help='dataset path')
    parser.add_argument('--model_name', type=str, default="GUI_OWL",
                        help='model name')
    parser.add_argument('--save_path', type=str, default="/Agent_ScanKit/datasets/json",
                        help='save path')
    # Forward the caller-supplied values; previously they were silently
    # ignored and sys.argv was always parsed.
    return parser.parse_args(args, namespace)

class GUIActPreProcess(BasePreProcess):
    def __init__(self, dataset_type, dataset_path, dataset_name, save_path, model_name):
        """Set up the annotation-file and image-folder paths for one GUIAct split.

        Args:
            dataset_type: ``'mobile'`` or ``'web_multi'``; any other value
                selects the web-single split.
            dataset_path: Directory holding the ``*_test_data.json`` files
                and ``*_test_images`` folders.
            dataset_name: Stored on the instance as ``dataset_name``.
            save_path: Forwarded to the parent initializer.
            model_name: Stored on the instance as ``model_name``.
        """
        super().__init__(dataset_path, dataset_name, save_path, model_name)
        self.dataset_type = dataset_type

        # Anything that is not 'mobile' or 'web_multi' falls back to web-single.
        split_prefix = {
            'mobile': "smartphone",
            'web_multi': "web-multi",
        }.get(self.dataset_type, "web-single")

        # After this point dataset_path refers to the annotation JSON file,
        # not the directory that was passed in.
        self.dataset_path = os.path.join(dataset_path, f"{split_prefix}_test_data.json")
        self.image_path = os.path.join(dataset_path, f"{split_prefix}_test_images")

        self.dataset_name = dataset_name
        self.model_name = model_name

    def get_direction(self, action):
        """Return the dominant direction of a two-point gesture.

        Args:
            action: Mapping with ``'from'`` and ``'to'`` entries; each is
                passed to ``self._extract_coordinates`` and must yield an
                ``(x, y)`` pair.

        Returns:
            ``"left"`` / ``"right"`` when the horizontal displacement is
            strictly larger in magnitude, otherwise ``"up"`` / ``"down"``
            (positive ``y`` displacement maps to ``"down"``).
        """
        start_x, start_y = self._extract_coordinates(action['from'])
        end_x, end_y = self._extract_coordinates(action['to'])
        delta_x, delta_y = end_x - start_x, end_y - start_y

        if abs(delta_x) > abs(delta_y):
            return "right" if delta_x > 0 else "left"
        # Ties and zero-length gestures are classified on the vertical axis.
        return "down" if delta_y > 0 else "up"

    def OS_ATLAS(self):
        """Convert the GUIAct split into OS-Atlas style records and save them.

        Every step becomes one record copied from the template returned by
        the parent ``OS_ATLAS``; it carries the screenshot path, goal, image
        size, the textual action label and the filled-in prompt.  As soon as
        one step of an episode cannot be mapped to an action string, the
        whole episode is discarded.  The records are written to
        ``<save_path>/<dataset_type>_<model_name lowercased>.json``.
        """
        template = super().OS_ATLAS()
        from copy import deepcopy

        def click_label(rel_x, rel_y):
            # Relative coordinates are multiplied by 1000 for the label.
            scaled_x, scaled_y = rel_x * 1000, rel_y * 1000
            return f"CLICK <point>[[{scaled_x}, {scaled_y}]]</point>"

        def scroll_label(entry, positive_label, negative_label):
            # Only a non-zero 'down' component produces a label.
            for axis, amount in entry["scroll"]["absolute"].items():
                if amount != 0 and axis == 'down':
                    return positive_label if amount > 0 else negative_label
            return None

        def actionMapping(action):
            """Translate one ground-truth action into an OS-Atlas string (or None)."""
            if self.dataset_type == 'mobile':
                kind = action['name']
                if kind == 'tap':
                    rel_x, rel_y = self._extract_coordinates(action['point']['related'])
                    return click_label(rel_x, rel_y)
                if kind == 'swipe':
                    swipe_dir = self.get_direction(action['dual_point']['absolute'])
                    return f"SCROLL [{swipe_dir.upper()}]"
                if kind == 'input':
                    return f"TYPE [{action['text']}]"
                if kind == 'answer':
                    return "COMPLETE"
                if kind == 'enter':
                    return "ENTER"
                return None

            if self.dataset_type == 'web_single':
                entry = action[0]
                kind = entry['name']
                if kind == 'click':
                    _, center = self._extract_coordinates(entry['element']['related'])
                    return click_label(center[0], center[1])
                if kind == 'scroll':
                    return scroll_label(entry, "SCROLL [DOWN]", "SCROLL [UP]")
                if kind == 'answer':
                    return "COMPLETE"
                return None

            # Remaining dataset types (web_multi): only the last action counts.
            entry = action[-1]
            kind = entry['name']
            if kind == 'click':
                _, center = self._extract_coordinates(entry['element']['related'])
                return click_label(center[0], center[1])
            if kind == 'scroll':
                # NOTE(review): the sign convention here is the opposite of the
                # web_single branch above (positive 'down' -> UP) — confirm
                # which of the two is intended.
                return scroll_label(entry, "SCROLL [UP]", "SCROLL [DOWN]")
            if kind == 'input':
                return f"TYPE [{entry['text']}]"
            if kind == 'answer':
                return "COMPLETE"
            return None

        episodes = self._get_episode_data(self.readJson(self.dataset_path))
        data = []
        for _, steps in tqdm(episodes.items()):
            episode_records = []
            for step in steps:
                entry = deepcopy(template)
                entry['images'] = [os.path.join(self.image_path, step['image_id']+'.png')]
                entry['goal'] = step['question']
                size = step['image_size']
                entry['image_size'] = [[size['width'], size['height']]]

                mapped = actionMapping(step['actions_label'])
                if mapped is None:
                    # One unmappable step invalidates the whole episode.
                    episode_records = []
                    break
                entry['label'] = "action:\n"+mapped
                entry['messages'][1]['content'] = mapped

                # uid separator and the action carrying the click target
                # depend on the split; mobile records get no bbox.
                if self.dataset_type == 'mobile':
                    separator, click_source = "_step_", None
                elif self.dataset_type == 'web_single':
                    separator, click_source = "_qa_", step['actions_label'][0]
                else:
                    separator, click_source = "_step_", step['actions_label'][-1]

                uid_parts = step['uid'].split(separator)
                entry['episode_id'] = uid_parts[0]
                entry['step_id'] = uid_parts[1]

                if click_source is not None and click_source['name'] == 'click':
                    box = self._extract_coordinates(click_source['element']['related'])[0]
                    entry['bbox'] = [value*1000 for value in box]

                prompt = GUIACTLOWACTIONPREDICTPROMPT_FOROSATLAS.replace("{finalGoal}", entry['goal'])
                prompt = prompt.replace("{previousActions}", str(step['actions_history']))
                entry['messages'][0]['content'] = prompt
                episode_records.append(entry)
            data.extend(episode_records)

        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)
        out_name = self.dataset_type+"_"+self.model_name.lower()+'.json'
        self.saveJson(data, os.path.join(self.save_path, out_name))
        logger.info("Finished")


    def UI_TARS(self):
        """Convert the GUIAct split into UI-TARS style records and save them.

        Every step yields one record copied from the template returned by the
        parent ``UI_TARS``.  Its message list is extended with up to four
        preceding (screenshot, "Thought/Action" text) pairs followed by the
        current screenshot.  Episodes containing a step whose action cannot
        be mapped are skipped entirely.  The records are written to
        ``<save_path>/<dataset_type>_<model_name lowercased>.json``.
        """
        from copy import deepcopy

        sample = super().UI_TARS()
        # Model names containing "1.5" get pixel coordinates; every other
        # model gets coordinates on a 1000-based scale.
        pixel_coords = "1.5" in self.model_name

        def scroll_action(entry):
            # Only a non-zero 'down' component produces an action.
            for axis, amount in entry["scroll"]["absolute"].items():
                if axis == 'down' and amount != 0:
                    return "scroll(direction='down')" if amount > 0 else "scroll(direction='up')"
            return None

        def actionMapping(action, image_size):
            """Translate one ground-truth action into a UI-TARS call string (or None)."""
            if self.dataset_type == 'mobile':
                action_type = action['name']
                if action_type == 'tap':
                    x, y = self._extract_coordinates(action['point']['absolute'])
                    if pixel_coords:
                        return f"click(start_box='({int(x)},{int(y)})')"
                    return f"click(start_box='({int(x/image_size[0]*1000)},{int(y/image_size[1]*1000)})')"
                if action_type == 'swipe':
                    direction = self.get_direction(action['dual_point']['absolute'])
                    return f"scroll(direction='{direction}')"
                if action_type == 'input':
                    return f"type(content='{action['text']}')"
                if action_type == 'answer':
                    return "finished()"
                if action_type == 'enter':
                    return "enter()"
                return None

            # Web splits: web_single reads the first action, the others the last.
            is_single = self.dataset_type == 'web_single'
            entry = action[0] if is_single else action[-1]
            action_name = entry['name']
            if action_name == 'click':
                _, gt = self._extract_coordinates(entry['element']['related'])
                if pixel_coords:
                    return f"click(start_box='({int(gt[0]*image_size[0])}, {int(gt[1]*image_size[1])})')"
                return f"click(start_box='({int(gt[0]*1000)},{int(gt[1]*1000)})')"
            if action_name == 'scroll':
                return scroll_action(entry)
            if action_name == 'answer':
                return "finished()"
            if action_name == 'input' and not is_single:
                return f"type(content='{entry['text']}')"
            return None

        def build_history(index, metadata):
            """Return the interleaved image/text messages of the (at most four) steps before `index`."""
            history = []
            for i in range(max(0, index - 4), index):
                history.append({
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "image": metadata['screenshots'][i]
                        }
                    ]
                })
                action = metadata["actions"][i]
                thought = metadata['step_instructions'][i]
                history.append({
                    "role": "assistant",
                    "content": [
                        {"type": "text", "text": f"Thought: {thought}\nAction: {action}"}
                    ]
                })
            return history

        groups = self._get_episode_data(self.readJson(self.dataset_path))
        data = []
        for eid, items in tqdm(groups.items()):
            actions = [
                actionMapping(step['actions_label'], [step['image_size']['width'], step['image_size']['height']])
                for step in items
            ]
            # An unmappable step invalidates the whole episode.
            if not items or None in actions:
                continue

            metadata = {
                'actions': actions,
                'screenshots': [os.path.join(self.image_path, step['image_id']+'.png') for step in items],
            }

            # The last step's history lists every earlier step, so it is parsed
            # once per episode.  An empty history (single-step episode) is
            # skipped; the mobile branch used to raise IndexError on it.
            # NOTE(review): web_single never gets 'step_instructions', so
            # build_history would raise KeyError for a multi-item web_single
            # group — confirm such groups cannot occur.
            if self.dataset_type != 'web_single':
                history_text = items[-1]['actions_history']
                if history_text != "":
                    if self.dataset_type == 'mobile':
                        metadata['step_instructions'] = [
                            line.split(":")[1].strip() for line in history_text.split("\n")
                        ]
                    else:
                        metadata['step_instructions'] = [
                            line.split(": ")[1].split(',')[-1] for line in history_text.split("\n")
                        ]

            tamp = []
            for idx, step in enumerate(items):
                record = deepcopy(sample)
                if self.dataset_type == 'mobile':
                    record['episode_id'] = step['uid'].split("_step_")[0]
                    record['step_id'] = step['uid'].split("_step_")[1]
                elif self.dataset_type == 'web_single':
                    record['episode_id'] = step['uid'].split("_qa_")[0]
                    record['step_id'] = step['uid'].split("_qa_")[1]
                    if step['actions_label'][0]['name'] == 'click':
                        bbox = self._extract_coordinates(step['actions_label'][0]['element']['related'])[0]
                        record['bbox'] = [item*1000 for item in bbox]
                else:
                    record['episode_id'] = step['uid'].split("_step_")[0]
                    record['step_id'] = step['uid'].split("_step_")[1]
                    if step['actions_label'][-1]['name'] == 'click':
                        bbox = self._extract_coordinates(step['actions_label'][-1]['element']['related'])[0]
                        record['bbox'] = [item*1000 for item in bbox]

                record['images'] = [metadata['screenshots'][idx]]
                record['goal'] = step['question']
                # The ground-truth label carries an empty thought.
                record['label'] = f"Thought: \nAction: {actions[idx]}"
                record['image_size'] = [[step['image_size']['width'], step['image_size']['height']]]
                record['messages'][1]['content'][0]['text'] = GUIACT_FORUITARS.replace("{instruction}", record['goal'])

                if idx != 0:
                    record['messages'].extend(build_history(idx, metadata))
                record['messages'].append({
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "image": record['images'][0]
                        }
                    ]
                })
                tamp.append(record)
            data.extend(tamp)

        os.makedirs(self.save_path, exist_ok=True)
        self.saveJson(data, os.path.join(self.save_path, self.dataset_type+"_"+self.model_name.lower()+'.json'))
        logger.info("Finished")

    def Aguvis(self):
        """Convert the GUIAct split into Aguvis style records and save them.

        Every step yields one record copied from the template returned by the
        parent ``Aguvis``; the label is a ``pyautogui`` / ``mobile`` call
        string prefixed with ``assistantos``.  Episodes containing a step
        whose action cannot be mapped are skipped entirely.  The records are
        written to ``<save_path>/<dataset_type>_<model_name lowercased>.json``.
        """
        template = super().Aguvis()
        from utils.schema.aguvisConstants import user_instruction
        from copy import deepcopy

        # Label strings shared by several branches below.
        hscroll_pos = "assistantos\npyautogui.hscroll(page=0.1)"
        hscroll_neg = "assistantos\npyautogui.hscroll(page=-0.1)"
        scroll_pos = "assistantos\npyautogui.scroll(page=0.1)"
        scroll_neg = "assistantos\npyautogui.scroll(page=-0.1)"
        terminate = "assistantos\nmobile.terminate(status='success')"

        def actionMapping(action):
            """Translate one ground-truth action into an Aguvis label (or None)."""
            if self.dataset_type == 'mobile':
                kind = action['name']
                if kind == 'tap':
                    x, y = self._extract_coordinates(action['point']['absolute'])
                    # NOTE(review): 'absolute' coordinates are divided by 1000
                    # here while the web branches multiply 'related' ones by
                    # 1000 — confirm the intended scale.
                    return f"assistantos\npyautogui.click(x={x/1000}, y={y/1000})"
                if kind == 'swipe':
                    swipe_dir = self.get_direction(action['dual_point']['absolute'])
                    by_direction = {'left': hscroll_pos, 'right': hscroll_neg, 'up': scroll_neg}
                    # Any other direction (i.e. 'down') scrolls by +0.1.
                    return by_direction.get(swipe_dir, scroll_pos)
                if kind == 'input':
                    return f"assistantos\npyautogui.write(message='{action['text']}')"
                if kind == 'answer':
                    return terminate
                if kind == 'enter':
                    return "assistantos\npyautogui.press(key=['enter'])"
                return None

            # Web splits: web_single reads the first action, the others the last.
            single = self.dataset_type == 'web_single'
            entry = action[0] if single else action[-1]
            kind = entry['name']
            if kind == 'click':
                _, gt = self._extract_coordinates(entry['element']['related'])
                return f"assistantos\npyautogui.click(x={gt[0]*1000}, y={gt[1]*1000})"
            if kind == 'scroll':
                # Only a non-zero 'down' component produces a label.
                for axis, amount in entry["scroll"]["absolute"].items():
                    if amount != 0 and axis == 'down':
                        return scroll_neg if amount > 0 else scroll_pos
                return None
            if kind == 'answer':
                return terminate
            if kind == 'input' and not single:
                return f"assistantos\npyautogui.write(message='{entry['text']}')"
            return None

        episodes = self._get_episode_data(self.readJson(self.dataset_path))
        data = []
        for _, steps in tqdm(episodes.items()):
            labels = [actionMapping(step['actions_label']) for step in steps]
            # An unmappable step invalidates the whole episode.
            if None in labels:
                continue

            for position, step in enumerate(steps):
                entry = deepcopy(template)

                # uid separator and the action carrying the click target
                # depend on the split; mobile records get no bbox.
                if self.dataset_type == 'mobile':
                    separator, click_source = "_step_", None
                elif self.dataset_type == 'web_single':
                    separator, click_source = "_qa_", step['actions_label'][0]
                else:
                    separator, click_source = "_step_", step['actions_label'][-1]

                uid_parts = step['uid'].split(separator)
                entry['episode_id'] = uid_parts[0]
                entry['step_id'] = uid_parts[1]
                if click_source is not None and click_source['name'] == 'click':
                    box = self._extract_coordinates(click_source['element']['related'])[0]
                    entry['bbox'] = [value*1000 for value in box]

                entry['images'] = [os.path.join(self.image_path, step['image_id']+'.png')]
                entry['goal'] = step['question']
                entry['label'] = labels[position]
                size = step['image_size']
                entry['image_size'] = [[size['width'], size['height']]]

                # NOTE(review): a list (not a joined string) is handed to the
                # prompt template — confirm that is what the template expects.
                previous_actions = [
                    line.replace("step", "Step")
                    for line in step['actions_history'].split('\n')
                ]
                entry['messages']['content'][1]['text'] = user_instruction.format(
                    overall_goal=entry['goal'],
                    previous_actions=previous_actions,
                    low_level_instruction="",
                )
                entry['is_low_level_instruction'] = False
                entry['mode'] = 'force-plan'
                data.append(entry)

        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)
        out_name = self.dataset_type+"_"+self.model_name.lower()+'.json'
        self.saveJson(data, os.path.join(self.save_path, out_name))
        logger.info("Finished")
        
    def Agent_CPM(self):
        """Convert the GUIAct split into AgentCPM style records and save them.

        Every step yields one record copied from the template returned by the
        parent ``Agent_CPM``; the label is the ``str()`` of a dict with an
        empty ``thought`` plus the action fields.  Episodes containing a step
        whose action cannot be mapped are skipped entirely.  The system prompt
        embeds the action schema loaded from disk with a ``required`` entry
        inserted.  The records are written to
        ``<save_path>/<dataset_type>_<model_name lowercased>.json``.
        """
        from copy import deepcopy

        sample = super().Agent_CPM()

        def scroll_action(entry):
            # Only a non-zero 'down' component produces an action.
            for axis, amount in entry["scroll"]["absolute"].items():
                if axis == 'down' and amount != 0:
                    target = "up" if amount > 0 else "down"
                    return str({"thought":"", "POINT": [-100, -100], "to": target})
            return None

        def actionMapping(action):
            """Translate one ground-truth action into an AgentCPM label (or None)."""
            if self.dataset_type == 'mobile':
                action_type = action['name']
                if action_type == 'tap':
                    x, y = self._extract_coordinates(action['point']['absolute'])
                    return str({"thought":"", "POINT": [x, y]})
                if action_type == 'swipe':
                    direction = self.get_direction(action['dual_point']['absolute'])
                    return str({"thought":"", "POINT": [-100, -100], "to": direction})
                if action_type == 'input':
                    return str({"thought":"", "TYPE": action['text']})
                if action_type == 'answer':
                    return str({"thought":"", 'STATUS': 'finish'})
                if action_type == 'enter':
                    return str({"thought":"", 'PRESS': 'ENTER'})
                return None

            # Web splits: web_single reads the first action, the others the last.
            is_single = self.dataset_type == 'web_single'
            entry = action[0] if is_single else action[-1]
            action_name = entry['name']
            if action_name == 'click':
                _, gt = self._extract_coordinates(entry['element']['related'])
                return str({"thought":"", "POINT": [gt[0]*1000, gt[1]*1000]})
            if action_name == 'scroll':
                return scroll_action(entry)
            if action_name == 'answer':
                return str({"thought":"", 'STATUS': 'finish'})
            if action_name == 'input' and not is_single:
                return str({"thought":"", "TYPE": entry['text']})
            return None

        from prompt.androidControlPrompt import AGENT_CPM_SYSTEM_PROMPT
        # Context manager so the schema file handle is closed deterministically.
        with open('/Agent_ScanKit/utils/schema/agentCPMSchema.json', encoding="utf-8") as schema_file:
            action_schema = json.load(schema_file)
        # Insert the "required" entry as the fourth key of the schema.
        schema_items = list(action_schema.items())
        schema_items.insert(3, ("required", ["thought"]))
        action_schema = dict(schema_items)
        system_prompt = AGENT_CPM_SYSTEM_PROMPT.replace("ACTION_SCHEMA", str(action_schema))

        groups = self._get_episode_data(self.readJson(self.dataset_path))
        data = []
        for eid, items in tqdm(groups.items()):
            action_traslate = [actionMapping(step['actions_label']) for step in items]
            # An unmappable step invalidates the whole episode.
            if None in action_traslate:
                continue

            for idx, step in enumerate(items):
                record = deepcopy(sample)
                if self.dataset_type == 'mobile':
                    record['episode_id'] = step['uid'].split("_step_")[0]
                    record['step_id'] = step['uid'].split("_step_")[1]
                elif self.dataset_type == 'web_single':
                    record['episode_id'] = step['uid'].split("_qa_")[0]
                    record['step_id'] = step['uid'].split("_qa_")[1]
                    if step['actions_label'][0]['name'] == 'click':
                        bbox = self._extract_coordinates(step['actions_label'][0]['element']['related'])[0]
                        record['bbox'] = [item*1000 for item in bbox]
                else:
                    record['episode_id'] = step['uid'].split("_step_")[0]
                    record['step_id'] = step['uid'].split("_step_")[1]
                    # Test the same (last) action whose element is read below;
                    # the check used to look at the first action, disagreeing
                    # with actionMapping and the element access.
                    last_action = step['actions_label'][-1]
                    if last_action['name'] == 'click':
                        bbox = self._extract_coordinates(last_action['element']['related'])[0]
                        record['bbox'] = [item*1000 for item in bbox]

                record['images'] = [os.path.join(self.image_path, step['image_id']+'.png')]
                record['goal'] = step['question']
                record['label'] = action_traslate[idx]
                record['image_size'] = [[step['image_size']['width'], step['image_size']['height']]]
                record['messages'][0]['content'][0] = record['messages'][0]['content'][0].replace("text_prompt", record['goal'])
                record['system_prompt'] = system_prompt
                data.append(record)

        os.makedirs(self.save_path, exist_ok=True)
        self.saveJson(data, os.path.join(self.save_path, self.dataset_type+"_"+self.model_name.lower()+'.json'))
        logger.info("Finished")

    def GUI_R1(self):
        """Convert the GUIAct split into GUI-R1 style records and save them.

        Every step yields one record copied from the template returned by the
        parent ``GUI_R1``; the label wraps the ``str()`` of a one-element list
        ``[{'action', 'point', 'input_text'}]`` in ``<think></think><answer>``
        tags.  Episodes containing a step whose action cannot be mapped are
        skipped entirely.  The records are written to
        ``<save_path>/<dataset_type>_<model_name lowercased>.json``.
        """
        from copy import deepcopy

        sample = super().GUI_R1()

        def scroll_text(entry, fallback):
            # Only a non-zero 'down' component overrides the fallback text.
            for axis, amount in entry["scroll"]["absolute"].items():
                if axis == 'down' and amount != 0:
                    return 'up' if amount > 0 else 'down'
            return fallback

        def actionMapping(action):
            """Translate one ground-truth action into a GUI-R1 answer string (or None)."""
            # Placeholders used when an action has no point / no text.
            point = [-100, -100]
            input_text = 'no input text'
            if self.dataset_type == 'mobile':
                action_type = action['name']
                if action_type == 'tap':
                    x, y = self._extract_coordinates(action['point']['absolute'])
                    action_name = 'click'
                    point = [int(x), int(y)]
                elif action_type == 'swipe':
                    # The label carries the opposite of the swipe direction.
                    map_direction = {'left': 'right', 'right': 'left', 'up': 'down', 'down': 'up'}
                    direction = self.get_direction(action['dual_point']['absolute'])
                    input_text = map_direction[direction]
                    action_name = 'scroll'
                elif action_type == 'input':
                    action_name = 'type'
                    input_text = action['text']
                elif action_type == 'answer':
                    action_name = 'complete'
                elif action_type == 'enter':
                    action_name = 'enter'
                else:
                    # Unknown mobile action: used to fail with
                    # UnboundLocalError; now reported as unmappable so the
                    # episode is skipped, like in the web_multi branch.
                    return None
            elif self.dataset_type == 'web_single':
                entry = action[0]
                action_name = entry['name']
                if action_name == 'click':
                    _, gt = self._extract_coordinates(entry['element']['absolute'])
                    point = [int(gt[0]), int(gt[1])]
                elif action_name == 'scroll':
                    input_text = scroll_text(entry, input_text)
                elif action_name == 'answer':
                    action_name = 'complete'
                # Any other web_single action keeps its raw name.
            else:
                entry = action[-1]
                action_name = entry['name']
                if action_name == 'click':
                    _, gt = self._extract_coordinates(entry['element']['absolute'])
                    point = [int(gt[0]), int(gt[1])]
                elif action_name == 'scroll':
                    input_text = scroll_text(entry, input_text)
                elif action_name == 'answer':
                    action_name = 'complete'
                elif action_name == 'input':
                    # NOTE(review): the action name stays 'input' here while
                    # the mobile branch emits 'type' — confirm this is intended.
                    input_text = entry['text']
                else:
                    return None
            formatted_action = [{
                'action': action_name,
                'point': point,
                'input_text': input_text
            }]
            return str(formatted_action)

        from prompt.guiActPrompt import GUIACT_FROGUIR1

        groups = self._get_episode_data(self.readJson(self.dataset_path))
        data = []
        for eid, items in tqdm(groups.items()):
            action_traslate = [actionMapping(step['actions_label']) for step in items]
            # An unmappable step invalidates the whole episode.
            if None in action_traslate:
                continue

            for idx, step in enumerate(items):
                record = deepcopy(sample)
                if self.dataset_type == 'mobile':
                    record['episode_id'] = step['uid'].split("_step_")[0]
                    record['step_id'] = step['uid'].split("_step_")[1]
                elif self.dataset_type == 'web_single':
                    record['episode_id'] = step['uid'].split("_qa_")[0]
                    record['step_id'] = step['uid'].split("_qa_")[1]
                    if step['actions_label'][0]['name'] == 'click':
                        bbox = self._extract_coordinates(step['actions_label'][0]['element']['related'])[0]
                        record['bbox'] = [item * 1000 for item in bbox]
                else:
                    record['episode_id'] = step['uid'].split("_step_")[0]
                    record['step_id'] = step['uid'].split("_step_")[1]
                    if step['actions_label'][-1]['name'] == 'click':
                        bbox = self._extract_coordinates(step['actions_label'][-1]['element']['related'])[0]
                        record['bbox'] = [item * 1000 for item in bbox]

                record['images'] = [os.path.join(self.image_path, step['image_id']+'.png')]
                record['goal'] = step['question']
                record['label'] = "<think></think><answer>"+action_traslate[idx]+"</answer>"
                record['image_size'] = [[step['image_size']['width'], step['image_size']['height']]]
                record['messages'][0]['content'][0]['image'] = record['images'][0]
                record['messages'][0]['content'][1]['text'] = '<image>\n' + GUIACT_FROGUIR1.replace("{goal}", record['goal'])
                record['messages'][0]['content'][1]['text'] = record['messages'][0]['content'][1]['text'].replace("{history}", str(step['actions_history']))
                data.append(record)

        os.makedirs(self.save_path, exist_ok=True)
        self.saveJson(data, os.path.join(self.save_path, self.dataset_type+"_"+self.model_name.lower()+'.json'))
        logger.info("Finished")

    def OS_Genesis(self):
        """Build the OS-Genesis JSON file for the configured GUIAct split.

        Each step of each episode becomes one record (a deep copy of the
        template returned by the base class ``OS_Genesis``) filled with the
        screenshot path, goal, ground-truth label, image size, episode/step
        ids and a prompt rendered from ``OS_GENESIS_PROMPT``.  The prompt
        embeds the step's action history and a stringified
        ``{name: (cx, cy)}`` map of the UI elements read from
        ``<image_path>.parquet``.  For the web splits, ``bbox`` is set to the
        element's absolute box when the labelled action is a click.

        An episode is skipped entirely when any of its actions cannot be
        translated.  The result is written to
        ``<save_path>/<dataset_type>_<model_name.lower()>.json``.
        """
        from copy import deepcopy

        sample = super().OS_Genesis()
        is_mobile = self.dataset_type == 'mobile'
        is_web_single = self.dataset_type == 'web_single'

        def stop_label():
            # NOTE(review): this label uses single quotes while every other
            # label is double-quoted JSON; kept as-is, confirm the evaluator
            # expects it.
            return f"Low-level thought: {' '} action: {{'action_type': 'stop'}}"

        def type_label(text):
            """Label for typing ``text``; x/y are fixed at -100."""
            action_dict = {
                "action_type": "type",
                "text": text,
                "x": -100,
                "y": -100
            }
            json_action = json.dumps(action_dict)
            return f"Low-level thought: {' '} action: {json_action}"

        def scroll_label(absolute):
            """Label for a web scroll; None unless the 'down' offset is non-zero."""
            amount = absolute.get('down', 0)
            if amount == 0:
                return None
            # A positive 'down' offset is labelled "up", any other non-zero one "down".
            if amount > 0:
                return f'Low-level thought: {" "} action: {{"action_type": "scroll", "direction": "up"}}'
            return f'Low-level thought: {" "} action: {{"action_type": "scroll", "direction": "down"}}'

        def actionMapping(action):
            """Translate one ``actions_label`` value into a label string.

            Returns None for actions that have no OS-Genesis counterpart.
            """
            if is_mobile:
                action_type = action['name']
                if action_type == 'tap':
                    x, y = self._extract_coordinates(action['point']['absolute'])
                    return f'Low-level thought: {" "} action: {{"action_type": "click", "x": {x}, "y": {y}}}'
                if action_type == 'swipe':
                    direction = self.get_direction(action['dual_point']['absolute'])
                    return f'Low-level thought: {" "} action: {{"action_type": "scroll", "direction": "{direction}"}}'
                if action_type == 'input':
                    return type_label(action["text"])
                if action_type == 'answer':
                    return stop_label()
                if action_type == 'enter':
                    return f'Low-level thought: {" "} action: {{"action_type": "enter"}}'
                return None

            # Web labels are lists: web_single is labelled by its first entry,
            # the remaining split by its last entry.
            current = action[0] if is_web_single else action[-1]
            action_name = current['name']
            if action_name == 'click':
                _, gt = self._extract_coordinates(current['element']['absolute'])
                return f'Low-level thought: {" "} action: {{"action_type": "click", "x": {gt[0]}, "y": {gt[1]}}}'
            if action_name == 'scroll':
                return scroll_label(current["scroll"]["absolute"])
            if action_name == 'answer':
                return stop_label()
            # 'input' is only translated for the multi-step web split.
            if action_name == 'input' and not is_web_single:
                return type_label(current["text"])
            return None

        def get_a11_tree(step_id, ui_elements):
            """Return ``{element name: (cx, cy)}`` for the row ``step_id``.

            The element list is taken from the second value of the row
            (``.values[1]``).  Elements sharing a name overwrite each other,
            the last one wins.
            """
            clickable_nodes = {}
            elements = ui_elements.loc[step_id].values[1]
            for element_item in elements:
                if is_mobile:
                    pos = element_item['position']
                    x, y, w, h = pos['x'], pos['y'], pos['width'], pos['height']
                    center = (x + w/2, y + h/2)
                    if element_item['ui_type'] == 'text':
                        key = element_item['text']
                    else:
                        key = element_item['ui_type']
                else:
                    pos = element_item['rect']
                    x1, y1, x2, y2 = pos['left'], pos['top'], pos['right'], pos['bottom']
                    center = ((x1+x2)/2, (y1 + y2)/2)
                    if element_item['text'] is not None:
                        key = element_item['text']
                    elif is_web_single:
                        match = re.search(r'id\("([^"]+)"\)', element_item['xpath'])
                        if not match:
                            # No text and no id in the xpath: nothing to name it by.
                            continue
                        # Use the id parsed from the xpath as the name.  Looking
                        # it up inside element_item would raise KeyError, since
                        # the id is not one of the element's fields.
                        key = match.group(1)
                    else:
                        key = element_item['type']
                clickable_nodes[key] = center
            return clickable_nodes

        from prompt.guiActPrompt import OS_GENESIS_PROMPT
        import pandas as pd
        groups = self._get_episode_data(self.readJson(self.dataset_path))
        ui_elements = pd.read_parquet(self.image_path+'.parquet')
        data = []
        for _eid, items in tqdm(groups.items()):
            labels = [actionMapping(step['actions_label']) for step in items]
            if None in labels:
                # One untranslatable action invalidates the whole episode.
                continue
            for step, label in zip(items, labels):
                record = deepcopy(sample)
                record['images'] = [os.path.join(self.image_path, step['image_id']+'.png')]
                record['goal'] = step['question']
                record['label'] = label
                record['image_size'] = [[step['image_size']['width'], step['image_size']['height']]]

                uid_parts = step['uid'].split("_qa_" if is_web_single else "_step_")
                record['episode_id'] = uid_parts[0]
                record['step_id'] = uid_parts[1]

                if not is_mobile:
                    labelled = step['actions_label'][0 if is_web_single else -1]
                    if labelled['name'] == 'click':
                        record['bbox'] = self._extract_coordinates(labelled['element']['absolute'])[0]

                # web_single rows are looked up by the last "_"-separated token
                # of the episode id; the other splits by the full uid.
                if is_web_single:
                    tree_key = record['episode_id'].split("_")[-1]
                else:
                    tree_key = step['uid']
                record['question'] = OS_GENESIS_PROMPT.format(
                    instruction=record['goal'],
                    history=step['actions_history'],
                    a11y_tree=str(get_a11_tree(tree_key, ui_elements)),
                )
                data.append(record)

        os.makedirs(self.save_path, exist_ok=True)
        self.saveJson(data, os.path.join(self.save_path, self.dataset_type+"_"+self.model_name.lower()+'.json'))
        logger.info("Finished")

    def GUI_Odyssey(self):
        """Build the GUI-Odyssey JSON file and its screenshot-history index.

        Each step becomes one record (a deep copy of the template returned by
        the base class ``GUI_Odyssey``) with episode/step ids, screenshot
        path, goal, command-style label, image size and a question rendered
        from the template's ``question`` string.  From the second step of an
        episode on, the question also carries an ``image-history`` marker and
        up to four of the step's most recent previous actions.  For the web
        splits, ``bbox`` is the clicked element's relative box scaled by 1000.

        An episode is skipped entirely when any of its actions cannot be
        translated.  Records are written to
        ``<save_path>/<dataset_type>_<model_name.lower()>.json``; the mapping
        from screenshot path to the episode's earlier screenshots is written
        to ``/Agent_ScanKit/utils/utils_odyssey/his_index.json``.
        """
        from copy import deepcopy

        sample = super().GUI_Odyssey()
        is_mobile = self.dataset_type == 'mobile'
        is_web_single = self.dataset_type == 'web_single'

        def actionMapping(action):
            """Translate one ``actions_label`` value into a command string.

            Returns None for actions that have no GUI-Odyssey counterpart.
            """
            if is_mobile:
                action_type = action['name']
                if action_type == 'tap':
                    x, y = self._extract_coordinates(action['point']['related'])
                    return f"CLICK: ({int(x*1000)}, {int(y*1000)})"
                if action_type == 'swipe':
                    direction = self.get_direction(action['dual_point']['absolute'])
                    return f"SCROLL: {direction.upper()}"
                if action_type == 'input':
                    return f"TYPE: {action['text']}"
                if action_type == 'answer':
                    return "COMPLETE"
                if action_type == 'enter':
                    return "ENTER"
                return None

            # Web labels are lists: web_single is labelled by its first entry,
            # the remaining split by its last entry.
            current = action[0] if is_web_single else action[-1]
            action_name = current['name']
            if action_name == 'click':
                _, gt = self._extract_coordinates(current['element']['absolute'])
                return f"CLICK: ({int(gt[0])}, {int(gt[1])})"
            if action_name == 'scroll':
                amount = current["scroll"]["absolute"].get('down', 0)
                if amount == 0:
                    return None
                # A positive 'down' offset is labelled UP, any other non-zero one DOWN.
                return "SCROLL: UP" if amount > 0 else "SCROLL: DOWN"
            if action_name == 'answer':
                return "COMPLETE"
            # 'input' is only translated for the multi-step web split.
            if action_name == 'input' and not is_web_single:
                return f"TYPE: {current['text']}"
            return None

        groups = self._get_episode_data(self.readJson(self.dataset_path))
        data = []
        his_index = {}
        for _eid, items in tqdm(groups.items()):
            action_traslate = [actionMapping(step['actions_label']) for step in items]
            if None in action_traslate:
                # One untranslatable action invalidates the whole episode.
                continue
            previous_screenshot_history: list = []
            for idx, step in enumerate(items):
                record = deepcopy(sample)

                uid_parts = step['uid'].split("_qa_" if is_web_single else "_step_")
                record['episode_id'] = uid_parts[0]
                record['step_id'] = uid_parts[1]
                if not is_mobile:
                    labelled = step['actions_label'][0 if is_web_single else -1]
                    if labelled['name'] == 'click':
                        bbox = self._extract_coordinates(labelled['element']['related'])[0]
                        record['bbox'] = [item*1000 for item in bbox]

                record['images'] = [os.path.join(self.image_path, step['image_id']+'.png')]
                image_path = record['images'][0]
                # Screenshots seen earlier in this episode, keyed by the current one.
                his_index[f"{image_path}"] = previous_screenshot_history[:idx]
                record['goal'] = step['question']
                record['label'] = action_traslate[idx]
                record['image_size'] = [[step['image_size']['width'], step['image_size']['height']]]
                question = record['question'].format(
                    instruction=record['goal'],
                    image_path=image_path
                )
                if idx > 0:
                    his_img = f'\nPrevious screenshots: <img>image-history: {image_path}</img>'
                    # Use this step's own history: the last step's history
                    # would expose later actions (including this step's
                    # label) to earlier steps.
                    recent_actions = step['actions_history'].split('\n')[-4:]
                    his_str = '\nPrevious Actions: ' + ''.join(
                        f"{number}. {past_action}\n"
                        for number, past_action in enumerate(recent_actions, 1)
                    )
                    # Appended once, after the list is complete, so the marker
                    # and the action list are not repeated per history line.
                    question = f"{question}{his_img}{his_str}"
                else:
                    question += '\nPrevious screenshots: None'
                    question += '\nPrevious Actions: None'

                question += '\nProvide the command-style action directly.'
                record['question'] = question
                previous_screenshot_history.append(image_path)
                data.append(record)

        os.makedirs(self.save_path, exist_ok=True)
        self.saveJson(data, os.path.join(self.save_path, self.dataset_type+"_"+self.model_name.lower()+'.json'))
        self.saveJson(his_index, os.path.join("/Agent_ScanKit/utils/utils_odyssey", "his_index.json"))
        logger.info("Finished")

    def GUI_OWL(self):
        """Build the GUI-OWL JSON file for the configured GUIAct split.

        Each step becomes one record (a deep copy of the template returned by
        the base class ``GUI_OWL``) with episode/step ids, screenshot path,
        goal, a ``<thinking>/<tool_call>/<conclusion>`` label, image size and
        qwen-format ``messages`` built from the system prompt (sized by the
        resized screenshot), the goal, the parsed ``step N: action`` history
        and the screenshot.  For the web splits, ``bbox`` is the clicked
        element's relative box scaled by 1000.

        An episode is skipped entirely when any of its actions cannot be
        translated.  The result is written to
        ``<save_path>/<dataset_type>_<model_name.lower()>.json``.
        """
        from copy import deepcopy

        build_system_messages, getResizedImage, build_user_messages, sample = super().GUI_OWL()
        is_mobile = self.dataset_type == 'mobile'
        is_web_single = self.dataset_type == 'web_single'

        def tool_call(arguments):
            """Wrap the body of the ``arguments`` JSON object in the label envelope."""
            head = '<thinking>\n""\n</thinking>\n<tool_call>\n{"name": "mobile_use", "arguments": {'
            tail = '}}\n</tool_call>\n<conclusion>\n""\n</conclusion>'
            return head + arguments + tail

        def click_call(x, y):
            return tool_call(f'"action": "click", "coordinate": [{x}, {y}]')

        def swipe_call(x1, y1, x2, y2):
            return tool_call(
                f'"action": "swipe", "coordinate": [{x1}, {y1}], "coordinate2": [{x2}, {y2}]'
            )

        def type_call(text):
            # json.dumps escapes quotes, backslashes and newlines so the
            # tool_call stays valid JSON; plain text is emitted unchanged.
            quoted = json.dumps(str(text), ensure_ascii=False)
            return tool_call(f'"action": "type", "text": {quoted}')

        def terminate_call():
            return tool_call('"action": "terminate", "status": "success"')

        def actionMapping(action):
            """Translate one ``actions_label`` value into a GUI-OWL label.

            Returns None for actions that have no GUI-OWL counterpart.
            """
            if is_mobile:
                action_type = action['name']
                if action_type == 'tap':
                    x, y = self._extract_coordinates(action['point']['related'])
                    return click_call(int(x*1000), int(y*1000))
                if action_type == 'swipe':
                    x1, y1 = self._extract_coordinates(action['dual_point']['absolute']['from'])
                    x2, y2 = self._extract_coordinates(action['dual_point']['absolute']['to'])
                    return swipe_call(x1, y1, x2, y2)
                if action_type == 'input':
                    return type_call(action['text'])
                if action_type == 'answer':
                    return terminate_call()
                if action_type == 'enter':
                    return tool_call('"action": "system_button", "button": "Enter"')
                return None

            # Web labels are lists: web_single is labelled by its first entry,
            # the remaining split by its last entry.
            current = action[0] if is_web_single else action[-1]
            action_name = current['name']
            if action_name == 'click':
                _, gt = self._extract_coordinates(current['element']['absolute'])
                return click_call(int(gt[0]), int(gt[1]))
            if action_name == 'scroll':
                amount = current["scroll"]["absolute"].get('down', 0)
                if amount == 0:
                    return None
                # A scroll is encoded as a unit swipe: (0, 0) -> (0, 1) for a
                # positive 'down' offset, the reverse otherwise.
                if amount > 0:
                    return swipe_call(0, 0, 0, 1)
                return swipe_call(0, 1, 0, 0)
            if action_name == 'answer':
                return terminate_call()
            # 'input' is only translated for the multi-step web split.
            if action_name == 'input' and not is_web_single:
                return type_call(current['text'])
            return None

        groups = self._get_episode_data(self.readJson(self.dataset_path))
        data = []
        for _eid, items in tqdm(groups.items()):
            action_traslate = [actionMapping(step['actions_label']) for step in items]
            if None in action_traslate:
                # One untranslatable action invalidates the whole episode.
                continue
            for step, label in zip(items, action_traslate):
                record = deepcopy(sample)

                uid_parts = step['uid'].split("_qa_" if is_web_single else "_step_")
                record['episode_id'] = uid_parts[0]
                record['step_id'] = uid_parts[1]
                if not is_mobile:
                    labelled = step['actions_label'][0 if is_web_single else -1]
                    if labelled['name'] == 'click':
                        bbox = self._extract_coordinates(labelled['element']['related'])[0]
                        record['bbox'] = [item * 1000 for item in bbox]

                record['images'] = [os.path.join(self.image_path, step['image_id']+'.png')]
                record['goal'] = step['question']
                record['label'] = label
                record['image_size'] = [[step['image_size']['width'], step['image_size']['height']]]

                # The system prompt is parameterised by the resized screenshot's size.
                dummy_image = getResizedImage(record['images'][0])
                system_messages = build_system_messages(dummy_image.height, dummy_image.width)
                history = [
                    (int(number), name)
                    for number, name in re.findall(r"step (\d+): (\w+)", step["actions_history"])
                ]
                user_messages = build_user_messages(record['goal'], enable_think=True, history=history)

                user_messages['content'].append({"image": record['images'][0]})
                messages = [system_messages, user_messages]
                record['messages'] = message_translate(messages, to_format='qwen')

                data.append(record)

        os.makedirs(self.save_path, exist_ok=True)
        self.saveJson(data, os.path.join(self.save_path, self.dataset_type+"_"+self.model_name.lower()+'.json'))
        logger.info("Finished")
            
              


    def _get_episode_data(self, data):
        from collections import defaultdict
        groups = defaultdict(list)

        for item in data:
            episode_id = item["uid"].split("_step_")[0]  
            groups[episode_id].append(item)
        return groups
    
    def _extract_coordinates(self, action):
        if self.dataset_type == 'mobile':        
            match = re.search(r"<point>([\d.]+),\s*([\d.]+)</point>", action)
            if match:
                x, y = map(float, match.groups())
                return x, y
            else:
                return None
        else:
            values = re.findall(r"[-+]?\d*\.?\d+", action)
            bbox = list(map(float, values))
            x1, y1, x2, y2 = bbox
            cx, cy = (x1 + x2) / 2, (y1 + y2) / 2
            return [x1, y1, x2, y2], [cx, cy]
        
if __name__ == '__main__':
    # Script entry point: parse the CLI arguments, build the pre-processor and
    # run the converter that matches --model_name.
    # Maps each supported model name to the GUIActPreProcess method that
    # produces its dataset; "UI_TARS_1.5" shares the UI_TARS converter.
    converter_by_model = {
        "OS_ATLAS": "OS_ATLAS",
        "UI_TARS": "UI_TARS",
        "UI_TARS_1.5": "UI_TARS",
        'GUI_R1': "GUI_R1",
        'Agent_CPM': "Agent_CPM",
        'OS_Genesis': "OS_Genesis",
        'Aguvis': "Aguvis",
        'GUI_Odyssey': "GUI_Odyssey",
        'GUI_OWL': "GUI_OWL",
    }

    args = parse_args()
    logger.info(args)
    process = GUIActPreProcess(
        args.dataset_type,
        args.dataset_path,
        args.dataset_name,
        args.save_path,
        args.model_name,
    )

    converter_name = converter_by_model.get(args.model_name)
    if converter_name is None:
        # Unknown model names are only reported; nothing is generated.
        logger.info("error processing")
    else:
        # Resolved lazily so only the selected converter is looked up.
        getattr(process, converter_name)()

    
  
        
        